2024 VIS Area Curation Committee Executive Summary
Quantitative Analysis
Code
import itertools
import sqlite3

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pio

# [jdf] no need to specify the renderer but, for interactive use,
# init_notebook should be called, e.g. pio.init_notebook_mode(connected=True)

# Width (in pixels) shared by all figures; see aspect() below.
width = 750

#### Data Preparation

# static data – codes -> names etc.
# The keys of this dict double as column names: `DataFrame.replace(staticdata)`
# below maps the codes of each same-named column to the readable labels.
staticdata = dict(
    decision={
        'C': 'Confer vs. cond Accept',  # relevant for the 2020 and 2021 data have a different meaning
        'A': 'Accept',  # for the 2020 data
        'A2': 'Accept',  # after the second round, should be 120 in 2022
        'R': 'Reject',  # reject after the first round -- should be 322 in 2022
        'R2': 'Reject in round 2',  # reject after the second round -- should be 2 in 2022
        'R-2nd': 'Reject in round 2',
        'DR-S': 'Desk Reject (Scope)',  # should be 7 in 2022
        'DR-P': 'Desk Reject (Plagiarism)',  # should be 4 in 2022
        'AR-P': 'Admin Reject (Plagiarism)',  # should be 1 in 2022
        'DR-F': 'Desk Reject (Format)',  # should be 4 in 2022
        'R-Strong': 'Reject Strong',  # cannot resubmit to TVCG for a year
        'T': 'Reject TVCG fasttrack',  # Explicitly invited to resubmit to TVCG, status in major revision
    },
    FinalDecision={
        # Just flatten to Accept and Reject
        'C': 'Accept',
        'A': 'Accept',  # for the 2020 data
        'A2': 'Accept',  # after the second round, should be 120 in 2022
        'R': 'Reject',  # reject after the first round -- should be 322 in 2022
        'R2': 'Reject',  # reject after the second round -- should be 2 in 2022
        'R-2nd': 'Reject',
        'DR-S': 'Reject',  # should be 7 in 2022
        'DR-P': 'Reject',  # should be 4 in 2022
        'AR-P': 'Reject',  # should be 1 in 2022
        'DR-F': 'Reject',  # should be 4 in 2022
        'R-Strong': 'Reject',
        'T': 'Reject',
    },
    area={
        'T&E': 'Theoretical & Empirical',
        'App': 'Applications',
        'S&R': 'Systems & Rendering',
        'R&I': 'Representations & Interaction',
        'DTr': 'Data Transformations',
        'A&D': 'Analytics & Decisions',
    },
    bid={
        0: 'no bid',
        1: 'want',
        2: 'willing',
        3: 'reluctant',
        4: 'conflict',
    },
    stat={
        'Prim': 'Primary',
        'Seco': 'Secondary',
    },
    # 2021 is correct as there was no new keywords file in 2022
    keywords=pd.read_csv("../data/2021/keywords.csv", sep=';'),
    colnames={
        'confsubid': 'Paper ID',
        'rid': 'Reviewer',
        'decision': 'Decision',
        'area': 'Area',
        'stat': 'Role',
        'bid': 'Bid',
    },
)

dbcon = sqlite3.connect('../data/vis-area-chair.db')  # [jdf] assume data is in ..


def _load_submissions(year=None):
    """Read the `submissions` table indexed by `sid`, optionally for one year.

    The year is passed as a bound SQL parameter instead of being pasted into
    the query text.
    """
    if year is None:
        return pd.read_sql_query('SELECT * from submissions', dbcon, index_col='sid')
    return pd.read_sql_query(
        'SELECT * from submissions WHERE year = ?',
        dbcon,
        index_col='sid',
        params=(year,),
    )


def _decision_rates(frame, column):
    """Count papers per (`column`, year) and add each count's share of its year.

    Returns a frame with the columns `column`, 'year', 'count' and
    'Percentage' (rounded to one decimal).
    """
    rates = (
        frame
        .value_counts([column, 'year'])
        # Name the counts explicitly: pandas >= 2 already calls them 'count',
        # older versions leave the column unnamed (0).
        .rename('count')
        .reset_index()
    )
    rates['Percentage'] = (
        rates.groupby(['year'])['count'].transform(lambda x: x / x.sum() * 100)
    )
    return rates.round({'Percentage': 1})


# Per-year raw tables are kept under their original names.
submissions_raw20 = _load_submissions(2020)
submissions_raw21 = _load_submissions(2021)
submissions_raw22 = _load_submissions(2022)
submissions_raw23 = _load_submissions(2023)
submissions_raw24 = _load_submissions(2024)
submissions_raw = _load_submissions()

submissions = (
    submissions_raw
    .join(
        pd.read_sql_query('SELECT * from areas', dbcon, 'aid'),
        on='aid',
    )
    # One list of keyword names per submission.
    .assign(Keywords=lambda df: (
        pd.read_sql_query('SELECT * FROM submissionkeywords', dbcon, 'sid')
        .loc[df.index]
        .join(
            pd.read_sql_query('SELECT * FROM keywords', dbcon, 'kid'),
            on='kid',
        )
        .keyword
        .groupby('sid')
        .apply(list)
    ))
    .assign(**{'# Keywords': lambda df: df.Keywords.apply(len)})
    # Copy of the raw decision code; `replace` maps it through
    # staticdata['FinalDecision'] (flattened Accept/Reject).
    .assign(**{'FinalDecision': lambda df: df['decision']})
    .replace(staticdata)
    .rename(columns=staticdata['colnames'])
    .drop(columns=['legacy', 'aid'])
    # note -- I changed the index, since 'Paper ID' was not unique for multiple years.
    # By not setting the index to 'Paper ID' the index remains with 'sid'.
    # However, 'sid' is used as a unique index in the creation of the database anyways.
)

# replace the old 'Paper ID' with a unique identifier, so that the code from 2021 will work
submissions = submissions.rename(columns={'Paper ID': 'Old Paper ID'})
submissions.reset_index(inplace=True)
submissions['Paper ID'] = submissions['sid']
submissions = submissions.set_index('Paper ID')
# submissions columns: (index), sid (unique id), Paper ID (unique), Old Paper ID,
# Decision, year, Area, Keywords (as a list), # Keywords

all_years = submissions['year'].unique()

# rates_decision computes the acceptance rates (and total number of papers) per year
# rates_decision: (index), Decision, year, count, Percentage
rates_decision = _decision_rates(submissions, 'Decision')

# Same as rates_decision, for the flattened Accept/Reject decision.
# rates_decision_final: (index), FinalDecision, year, count, Percentage
rates_decision_final = _decision_rates(submissions, 'FinalDecision')

# bids_raw: (index), Reviewer ID, sid (unique paper identifier over mult years),
# match score, bid of the reviewer, role of the reviewer, Paper ID
bids_raw = (
    pd.read_sql_query('SELECT * from reviewerbids', dbcon)
    .merge(submissions_raw['confsubid'], on='sid')
    .replace(staticdata)
    .rename(columns=staticdata['colnames'])
)

# Renaming Paper ID to Old Paper ID, setting Paper ID to sid, keeping all 3 for now...
bids_raw = bids_raw.rename(columns={'Paper ID': 'Old Paper ID'})
bids_raw['Paper ID'] = bids_raw['sid']

# bids = Reviewer, sid, Bid (how the reviewer bid on this paper)
# doesn't include review/sid that were not bid for [.query('Bid != "no bid"')]
bids = (
    bids_raw
    .query('Bid != "no bid"')
    [['Reviewer', 'sid', 'Paper ID', 'Bid']]
    .reset_index(drop=True)
)

# matchscores becomes a table to reviewer/sid with the match scores
# many of these will be "NaN" since we now have multiple years together.
# we need to check whether the reviewer IDs remain unique across the years!
matchscores = (
    bids_raw
    [['Reviewer', 'sid', 'Paper ID', 'match']]
    .set_index(['Reviewer', 'Paper ID'])
    .match
    .unstack(level=1)
)

# assignments = Reviewer, sid, Role (primary, secondary)
# doesn't include review/sid that were not assigned [.query('Role != ""')]
assignments = (
    bids_raw
    .query('Role != ""')
    [['Reviewer', 'sid', 'Paper ID', 'Role']]
    .reset_index(drop=True)
)

# All queries above are fully materialised, so release the database handle
# explicitly; `del` alone only removes the name.
dbcon.close()
del dbcon

#### Plot Defaults


def _axis_defaults():
    """Return the axis styling shared by the x and y axes of the template."""
    return dict(
        title=dict(
            font=dict(family='Fira Sans Medium', size=13),
            standoff=10,
        ),
        gridcolor='lightgray',
        gridwidth=1,
        automargin=True,
        fixedrange=True,
    )


acc_template = go.layout.Template()
acc_template.layout = dict(
    font=dict(family='Fira Sans', color='black', size=13),
    title_font_size=14,
    plot_bgcolor='rgba(255,255,255,0)',
    paper_bgcolor='rgba(255,255,255,0)',
    margin=dict(pad=10),
    xaxis=_axis_defaults(),
    yaxis=_axis_defaults(),
    legend=dict(
        title_font_family="Fira Sans Medium",
    ),
    colorway=px.colors.qualitative.T10,
    hovermode='closest',
    hoverlabel=dict(
        bgcolor="white",
        bordercolor='lightgray',
        font_color='black',
        font_family='Fira Sans',
    ),
)
acc_template.data.bar = [dict(
    textposition='inside',
    insidetextanchor='middle',
    textfont_size=12,
)]

px.defaults.template = acc_template
# Several codes map to the same label (e.g. 'A' and 'A2' -> 'Accept');
# dict.fromkeys drops the repeats while keeping first-occurrence order.
px.defaults.category_orders = {
    'Decision': list(dict.fromkeys(staticdata['decision'].values())),
    'FinalDecision': list(dict.fromkeys(staticdata['FinalDecision'].values())),
    'Area': list(staticdata['area'].values()),
    'Short Name': staticdata['keywords']['Short Name'].tolist(),
}

# Passed to every `fig.show(config=...)` call.
config = dict(
    displayModeBar=False,
    scrollZoom=False,
    responsive=False,
)


def aspect(ratio):
    """Return `width`/`height` layout kwargs for a figure with height = ratio * width."""
    return {'width': width, 'height': int(ratio * width)}


# useful data sub-products

# k_all columns: (index), Paper ID, Old Paper ID, Decision, year, Area,
# Keywords (as a list), # Keywords, Keyword, Category, Subcategory, Short Name, Description
# One row per (submission, keyword) pair.
k_all = (
    submissions
    .join(
        submissions['Keywords']
        .explode()
        .rename('Keyword')
    )
    .reset_index(level=0)
    .merge(staticdata['keywords'], on='Keyword')
)

# k_total columns: Category, Subcategory, Short Name, Keyword, Description, # Submissions, year
# counts the total number of submissions per keyword and year
k_total = staticdata['keywords'].merge(
    k_all.value_counts(['Short Name', 'year'])
    .rename('# Submissions')
    .reset_index(),
    how='right',
)

# k_cnt: how often was a particular keyword used among all submissions within a year????
# k_cnt columns: (index), Short Name, year, c, Category, Subcategory, Keyword, Description
# not clear how k_cnt and k_total differ!
k_cnt = (
    k_all
    .value_counts(['Short Name', 'year'], sort=False)
    .rename('c')
    .to_frame()
    .reset_index()
    .merge(staticdata['keywords'], on='Short Name')
)
Deeper data investigation
This report is generated by members of the ACC for the current year, and prepared for the VSC. Upon review, it will be linked from the IEEE VIS website. The conclusions and discussion points are based on submission and reviewer data from IEEE VIS 2024 (and previous years). The report and the analysis performed focus on the use of keywords, areas, and reviewer matching. Thus, there are likely other aspects of conference organization which are not covered (but could be considered).
The report is broken down into the following sections. After the summary at the beginning, the data and analysis process is described. It shows which data we used, where it is stored, and how it is obtained. These processes can be adapted for future years of this committee.
(NB: Some of the plots shown above are repeated here from the highlights for the sake of completeness.)
Data and Process
We analyzed anonymized data containing information about the full paper submissions to VIS 2024, the reviews of these submissions, and the IPC bidding preferences. We analyzed this data to understand how well the areas and keywords characterize the body of work submitted this year. We also analyzed the IPC bidding information to understand how well the expertise of the IPC members covers the submissions. Below, we show highlights of our findings.
Note that in the analysis that follows, the submission/paper IDs and reviewer IDs are anonymized through a randomizer, and are not the IDs used in PCS for submissions and reviewers.
The data used to perform this analysis is a combination of paper submission data and reviewer bidding data. Both sets were anonymized to minimize the ability to identify IPC members, authors, or reviewers. The analysis of the data in this year uses the anonymized CSV files obtained directly from PCS. You can see the source code used to process and generate the plots in this document by clicking on the “Code” buttons, which will fold out the Python code used. The anonymization script that was used is located in the anonymization-scripts folder (and may need to be updated to correspond with changes made in PCS). In order to get ALL the data, it is currently run by James at PCS, who sends the resulting anonymized files to the committee, where they are stored in the corresponding year folder.
In order to facilitate longitudinal studies of this data, we are also providing a sqlite database with the 2020, 2021, and 2022 data in an attempt to make it easier to incorporate future years. This database (as well as the source code of this document) can be found here
Sanity Checks
We include some sanity checks on the data in order to make sure the data has been processed correctly. In 2024, we should have:
139 papers accepted after the second round
236 papers rejected after the first round
9 papers desk rejected
Code
def _decision_rates(column):
    """Count submissions per (`column`, year) and add each count's share of its year.

    Returns a frame with the columns `column`, 'year', 'count' and
    'Percentage' (rounded to one decimal).
    """
    rates = (
        submissions
        .value_counts([column, 'year'])
        # Name the counts explicitly: pandas >= 2 already calls them 'count',
        # older versions leave the column unnamed (0).
        .rename('count')
        .reset_index()
    )
    rates['Percentage'] = (
        rates.groupby(['year'])['count'].transform(lambda x: x / x.sum() * 100)
    )
    return rates.round({'Percentage': 1})


# rates_decision computes the acceptance rates (and total number of papers) per year
# rates_decision: (index), Decision, year, count, Percentage
rates_decision = _decision_rates('Decision')

# Same table for the flattened Accept/Reject decision.
# rates_decision_final: (index), FinalDecision, year, count, Percentage
rates_decision_final = _decision_rates('FinalDecision')

#| output: true
# Sanity-check table, newest year first; the sorted frame is not assigned.
# NOTE(review): presumably this was the last expression of its own notebook
# cell so that it is displayed -- confirm against the .qmd source.
rates_decision_final.sort_values(by=['year', 'FinalDecision'], ascending=[False, True], ignore_index=True)

# Absolute number of submissions per decision and year.
# `fig` is bound to the Figure itself; `.show()` returns None.
fig = px.bar(
    rates_decision,
    x='count',
    y='year',
    barmode='stack',
    orientation='h',
    color='Decision',
    text='count',
    custom_data=['Decision'],
)
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    title='Submissions',
    xaxis_title='Number of Submissions',
    **aspect(0.45)
)
fig.update_traces(
    hovertemplate='%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
)
fig.show(config=config)

# Same data as a percentage of each year's submissions.
fig = px.bar(
    rates_decision,
    x='Percentage',
    y='year',
    barmode='stack',
    orientation='h',
    color='Decision',
    text='Percentage',
    custom_data=['Decision', 'count'],
)
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    title='Submissions',
    xaxis_title='Percentage of Submissions',
    **aspect(0.45)
)
fig.update_traces(
    hovertemplate='%{customdata[1]} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
)
fig.show(config=config)
Submissions per Area.
We wanted to understand how submissions were distributed by area, including acceptance decisions. Submissions to each area were within reasonable upper and lower limits, and decisions did not appear partial to any individual area.
Code
def group_stat(g):
    """Return a frame with the values of `g`, their percentage share and the total.

    `g` is used as the '# Submissions' column; '% Submissions' is
    `g / g.sum() * 100` rounded to one decimal, and 'Total' repeats `g.sum()`.
    """
    return pd.DataFrame({
        '# Submissions': g,
        '% Submissions': round(g / g.sum() * 100, 1),
        'Total': g.sum()
    })


# Submissions per (Area, Decision, year), for the years after 2020.
tmp = (
    submissions[submissions.year > 2020]
    .value_counts(['Area', 'Decision', 'year'])
    # Name the counts explicitly: pandas >= 2 already calls them 'count',
    # older versions leave the column unnamed (0).
    .rename('count')
    .reset_index()
)

# Newest year on top; derived from the data instead of a hard-coded list.
_years_desc = sorted(tmp['year'].unique().tolist(), reverse=True)

# An explicit `category_orders` replaces px.defaults.category_orders, so merge
# the defaults back in to keep the Area/Decision ordering (and thereby the
# colour assignment) consistent with the other figures.
fig = px.bar(
    tmp,
    x='count',
    y='Area',
    barmode='stack',
    orientation='h',
    color='Decision',
    text='count',
    custom_data=['Decision'],
    facet_row='year',
    category_orders={**(px.defaults.category_orders or {}), 'year': _years_desc},
)
fig.update_layout(
    title='Submissions by area and year',
    xaxis_title='Number of Submissions',
    yaxis=dict(
        autorange="reversed",
        tickfont=dict(size=12),  # Adjust y-label fontsize
    ),
    **aspect(1.3)
)
fig.update_traces(
    hovertemplate='%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
    texttemplate='%{text}',
    textangle=0,  # Force labels to have horizontal orientation
)
fig.show(config=config)

# Sum over the years so that every (Area, Decision) pair is one bar segment
# whose label and hover text show the total across all plotted years.
_tmp_all_years = tmp.groupby(['Area', 'Decision'], as_index=False)['count'].sum()

fig = px.bar(
    _tmp_all_years,
    x='count',
    y='Area',
    barmode='stack',
    orientation='h',
    color='Decision',
    text='count',
    custom_data=['Decision'],
)
fig.update_layout(
    title='Submissions by area all years',
    xaxis_title='Number of Submissions all years',
    yaxis=dict(
        autorange="reversed",
        tickfont=dict(size=12),  # Adjust y-label fontsize
    ),
    **aspect(0.5)
)
fig.update_traces(
    hovertemplate='%{x} submissions in %{y} have decision %{customdata[0]}<extra></extra>',
)
fig.show(config=config)
Submissions and Keywords used
Keywords with strong variations between 2023 and 2024 are listed here, with their historical differences. Despite the variations, there is no strong trend over the four years; the differences are mostly year-to-year variations.
# do a manual histogram to include non-specified keywords

# Facet rows from the newest to the oldest year. A concrete list is passed:
# `reversed(...)` is a one-shot iterator and would be empty on a second pass.
_years_desc = sorted(k_total['year'].unique().tolist(), reverse=True)

px.bar(
    k_total,
    x='Short Name',
    y='# Submissions',
    color='Category',
    facet_row='year',
    category_orders={'year': _years_desc},
).update_traces(
    hovertemplate="'%{x}' specified in %{y} submissions<extra></extra>",
).update_layout(
    xaxis_tickfont_size=8,
    xaxis_dtick=1,
    yaxis_dtick=50,
    hovermode='closest',
    title='Frequency of keywords across submissions',
    **aspect(0.8)
).show(config=config)
How are keywords distributed across areas?
Code
# do a manual histogram to include non-specified keywords

# Every keyword paired with every area, so unused combinations still get a row.
_area_frame = pd.DataFrame(staticdata['area'].values(), columns=['Area'])
_keyword_area_grid = staticdata['keywords'].merge(_area_frame, how='cross')

# Observed number of submissions per (keyword, area).
_observed_counts = (
    k_all
    .value_counts(['Short Name', 'Area'])
    .rename('# Submissions')
    .reset_index()
)

# Note: this rebinds the module-level name `k_cnt`.
k_cnt = _keyword_area_grid.merge(_observed_counts, how='outer')
k_cnt = k_cnt.fillna(1e-10)  # needed for sorting, Plotly bug?

_fig = px.bar(
    k_cnt,
    x='Short Name',
    y='# Submissions',
    color='Area',
    custom_data=['Area'],
)
_fig.update_traces(
    hovertemplate='Keyword "%{x}" specified by %{y} submissions from area "%{customdata}"<extra></extra>'
)
_fig.update_layout(
    barmode='stack',
    xaxis_dtick=1,
    xaxis_tickfont_size=8,
    xaxis_fixedrange=True,
    yaxis_fixedrange=True,
    xaxis_categoryorder='total descending',
    title='Frequency of keywords across submissions, by area',
    **aspect(0.5)
)
_fig.show(config=config)
How many submissions specified a given number of keywords?
Code
# Number of submissions for each (keyword count, area) combination.
_keyword_count_series = submissions.value_counts(['# Keywords', 'Area'])
tmp = _keyword_count_series.rename('# Submissions').reset_index()

# Stacked bars: one bar per keyword count, split by area.
_fig = px.bar(
    tmp,
    x='# Keywords',
    y='# Submissions',
    barmode='stack',
    color='Area',
    custom_data=['Area'],
)
_fig.update_traces(
    hovertemplate='%{y} submissions specified %{x} keywords in area "%{customdata}"<extra></extra>',
)
_fig.update_layout(
    xaxis_dtick=1,
    title='Keyword count per submission',
    **aspect(0.5)
)
_fig.show(config=config)
How often are pairs of keywords specified together?